Part 1 - Exploring the data

1 Inspecting the summary statistics

# Load the dataset from CSV; the column-type message is suppressed.
data_Sleep_Fatigue_AlcoholUse <- read_csv(
  file = 'df_Sleep_Fatigue_AlcoholUse.csv',
  show_col_types = FALSE
)
## New names:
## • `` -> `...1`
# Per-column summary statistics (range, quartiles, mean, NA count).
data_Sleep_Fatigue_AlcoholUse |>
  summary()
##       ...1              X                ID             TSC       
##  Min.   :  1.00   Min.   :  1.00   Min.   :101.0   Min.   :1.850  
##  1st Qu.: 43.75   1st Qu.: 43.75   1st Qu.:143.8   1st Qu.:2.400  
##  Median : 86.50   Median : 86.50   Median :186.5   Median :2.850  
##  Mean   : 86.50   Mean   : 86.50   Mean   :186.5   Mean   :2.831  
##  3rd Qu.:129.25   3rd Qu.:129.25   3rd Qu.:229.2   3rd Qu.:3.150  
##  Max.   :172.00   Max.   :172.00   Max.   :272.0   Max.   :4.310  
##                                                    NA's   :10     
##       FSS       CIS_Fatigue_severity CIS_Concentration CIS_Motivation
##  Min.   :1.00   Min.   :1.500        Min.   :1.600     Min.   :1.00  
##  1st Qu.:3.00   1st Qu.:2.880        1st Qu.:3.000     1st Qu.:2.50  
##  Median :3.67   Median :3.815        Median :4.000     Median :3.00  
##  Mean   :3.66   Mean   :3.856        Mean   :3.926     Mean   :3.21  
##  3rd Qu.:4.22   3rd Qu.:4.620        3rd Qu.:4.600     3rd Qu.:3.75  
##  Max.   :7.00   Max.   :7.000        Max.   :6.800     Max.   :6.00  
##  NA's   :13     NA's   :10           NA's   :10        NA's   :10    
##   CIS_Activity   PSQI_component1 PSQI_component2 PSQI_component3 
##  Min.   :1.000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.330   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :3.000   Median :1.000   Median :1.000   Median :0.0000  
##  Mean   :3.259   Mean   :1.107   Mean   :1.261   Mean   :0.5101  
##  3rd Qu.:4.330   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :6.330   Max.   :2.000   Max.   :3.000   Max.   :3.0000  
##  NA's   :10      NA's   :23      NA's   :11      NA's   :23      
##  PSQI_component4  PSQI_component5 PSQI_component6  PSQI_component7
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.0000   Median :1.000   Median :0.0000   Median :1.000  
##  Mean   :0.4362   Mean   :1.112   Mean   :0.1544   Mean   :1.422  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:2.000  
##  Max.   :3.0000   Max.   :2.000   Max.   :3.0000   Max.   :3.000  
##  NA's   :23       NA's   :11      NA's   :23       NA's   :11     
##  PSQI_GlobalScore   MCTQ_MSFsc    MCTQ_SocialJetlag  AUDIT_Score    
##  Min.   : 1.000   Min.   :2.440   Min.   :0.000     Min.   : 0.000  
##  1st Qu.: 4.000   1st Qu.:4.407   1st Qu.:0.980     1st Qu.: 5.000  
##  Median : 6.000   Median :5.105   Median :1.520     Median : 9.000  
##  Mean   : 5.906   Mean   :5.113   Mean   :1.555     Mean   : 8.886  
##  3rd Qu.: 7.000   3rd Qu.:5.870   3rd Qu.:2.010     3rd Qu.:12.000  
##  Max.   :15.000   Max.   :8.080   Max.   :4.500     Max.   :25.000  
##  NA's   :23       NA's   :32      NA's   :32        NA's   :23
# Select the five variables of interest under their presentation labels.
# check.names = FALSE keeps the labels (which contain spaces and parentheses)
# exactly as written, so the column names do not have to be assigned a second
# time after data.frame() has mangled them.
filtered_data <- data.frame(
  'Trait self-control' = data_Sleep_Fatigue_AlcoholUse$TSC,
  'Concentration (CIS)' = data_Sleep_Fatigue_AlcoholUse$CIS_Concentration,
  'Disturbances in sleep quality (PSQI)' = data_Sleep_Fatigue_AlcoholUse$PSQI_GlobalScore,
  'Chronotype (MCTQ)' = data_Sleep_Fatigue_AlcoholUse$MCTQ_MSFsc,
  'Alcohol usage (AUDIT)' = data_Sleep_Fatigue_AlcoholUse$AUDIT_Score,
  check.names = FALSE
)

# Descriptive statistics per variable; only the columns listed in
# shown_columns are kept for the table.
description_filtered_data <- describe(filtered_data, fast = TRUE)

shown_columns <- c('n', 'mean', 'sd', 'min', 'max')
description_filtered_data <- description_filtered_data[shown_columns]

# Asterisks render the statistic names in italics in the table header.
colnames(description_filtered_data) <- c('*n*', '*mean*', '*sd*', '*min*', '*max*')

kable(description_filtered_data,
      digits = 2,
      caption = 'Table 1. Descriptive Statistics') |>
  kable_classic() |>
  kable_styling(full_width = FALSE, font_size = 20)
Table 1. Descriptive Statistics
n mean sd min max
Trait self-control 162 2.83 0.52 1.85 4.31
Concentration (CIS) 162 3.93 1.12 1.60 6.80
Disturbances in sleep quality (PSQI) 149 5.91 2.43 1.00 15.00
Chronotype (MCTQ) 140 5.11 1.08 2.44 8.08
Alcohol usage (AUDIT) 149 8.89 5.23 0.00 25.00

2 Inspecting the variance

#' Build a 10-bin histogram of one column of a data frame.
#'
#' @param x Name of the column to plot, given as a string.
#' @param data Data frame that contains column `x`.
#' @param label Text used as the x-axis label.
#' @param fill_color Fill colour of the bars.
#' @param min_value Lower limit of the x-axis.
#' @param max_value Upper limit of the x-axis.
#' @return A ggplot object with 'Frequency' as the y-axis label.
create_hist <- function(x, data, label, fill_color, min_value, max_value) {
  ggplot(data, aes(x = .data[[x]])) +
    geom_histogram(fill = fill_color, bins = 10) +
    # xlim() censors everything outside the limits, which drops the outermost
    # bars whose edges extend past min_value / max_value (the "Removed 2 rows
    # containing missing values" warning). The limits stay the same here, but
    # out-of-bounds values are kept so those bars are still drawn.
    scale_x_continuous(
      limits = c(min_value, max_value),
      oob = scales::oob_keep
    ) +
    xlab(label) +
    ylab('Frequency')
}

# Keep only the rows that have no missing value in any column of
# filtered_data (listwise deletion).
filtered_data <- na.omit(filtered_data)

# One histogram per column, with the x-axis spanning that column's observed
# range. lapply() builds the list in one pass instead of growing it with
# append() inside a loop.
plots_step_2 <- lapply(colnames(filtered_data), function(column) {
  values <- filtered_data[[column]]
  create_hist(column, filtered_data, column, "red", min(values), max(values))
})

# Arrange all histograms in a single figure.
figure1 <- ggarrange(plotlist = plots_step_2)
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
## Removed 2 rows containing missing values (`geom_bar()`).
## Removed 2 rows containing missing values (`geom_bar()`).
## Removed 2 rows containing missing values (`geom_bar()`).
## Removed 2 rows containing missing values (`geom_bar()`).
# Render the arranged histogram figure.
figure1 |>
  print()

3 Inspecting the central tendency

# Bounds for trait self-control (the first column of filtered_data):
# the mean plus/minus two standard deviations.
tsc_values <- filtered_data[[1]]
sd_2 <- sd(tsc_values) * 2
# Stored as tsc_mean rather than `mean`, so that the base function mean() is
# not masked by a numeric variable, and computed once instead of three times.
tsc_mean <- mean(tsc_values)
lowerbound <- tsc_mean - sd_2
upperbound <- tsc_mean + sd_2

# The x-axis starts at lowerbound instead of the observed minimum, because
# lowerbound is lower than the minimum value.
figure2 <- create_hist('Trait self-control', filtered_data, 'Trait self-control', 'red', lowerbound, max(tsc_values)) +
  geom_vline(xintercept = tsc_mean, color = 'black') +
  ggtitle('Figure 2')
mean_rounded <- round(tsc_mean, 2)

# Same plot with the rounded mean written next to the mean line; the label
# position (x = 3.10, y = 21) is hard-coded.
figure3 <- figure2 +
  annotate('text', label = paste('mean = ', mean_rounded), x = 3.10, y = 21) +
  ggtitle('Figure 3')

print(figure3)
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

4 Detecting potential outliers

# Rows of the full dataset whose TSC score lies outside
# [lowerbound, upperbound]; rows with a missing TSC are dropped by filter().
outliers <- filter(
  data_Sleep_Fatigue_AlcoholUse,
  TSC < lowerbound | TSC > upperbound
)

ID_outliers <- outliers[["ID"]]

# Report the participant IDs of the outliers, or the observed range when
# there are none.
if (length(ID_outliers) >= 1) {
  print(paste('outlier:', ID_outliers))
} else {
  print('There are no outliers')
  print(paste('The maximum is', max(filtered_data[[1]])))
  print(paste('The minimum is', min(filtered_data[[1]])))
}
## [1] "outlier: 142" "outlier: 217"
# Vertical lines marking the upper and lower outlier bounds.
bound_lines <- list(
  geom_vline(xintercept = upperbound),
  geom_vline(xintercept = lowerbound)
)

# NOTE(review): the object is figure4 but its title reads 'Figure 5';
# presumably because it is only displayed through the plotly conversion
# below. Confirm the intended figure numbering.
figure4 <- figure3 + bound_lines + ggtitle('Figure 5')

# Interactive version of the same plot.
figure5 <- ggplotly(figure4)

figure5
# Boxplot of trait self-control, drawn from the full dataset rather than the
# NA-filtered copy. The data frame is passed to ggplot() and the column is
# mapped by name instead of using `data$column` inside aes().
figure6 <- ggplot(data_Sleep_Fatigue_AlcoholUse, aes(x = TSC)) +
  # Missing TSC values are dropped explicitly rather than with a warning.
  geom_boxplot(na.rm = TRUE) +
  xlab('Trait self-control') +
  # A single horizontal boxplot has no frequency axis: the y-axis carries no
  # information, so its title, tick labels and ticks are removed instead of
  # being labelled 'frequency'.
  ylab(NULL) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ggtitle('Figure 6')

print(figure6)
## Warning: Removed 10 rows containing non-finite values (`stat_boxplot()`).

The boxplot changes the previous conclusion about outliers: it shows only a single outlier, whereas the criterion of two standard deviations from the mean identified two.

5 Inspecting covariance

# Pairwise correlation matrix of the columns of filtered_data.
corr <- cor(x = filtered_data)

# Correlation plot with the coefficients printed in the cells and the
# variables reordered by hierarchical clustering (hc.order = TRUE).
figure7 <- ggcorrplot(
  corr = corr,
  hc.order = TRUE,
  lab = TRUE
)

print(figure7)